package com.play;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.xml.sax.SAXException;

import edu.uci.ics.crawler4j.parser.ExtractedUrlAnchorPair;
import edu.uci.ics.crawler4j.parser.HtmlContentHandler;

/**
 * Content handler that records body text, anchor text and outgoing URLs only
 * while a capture flag is switched on.
 *
 * <p>The flag is driven by two marker strings looked for in the character data
 * handed to {@link #characters(char[], int, int)}: seeing the start marker turns
 * capturing on, seeing the end marker turns it off. A marker is only recognised
 * when it appears wholly inside a single chunk passed to that method.
 */
public class YintaiHandler extends HtmlContentHandler {

	/** Marker text ("browsing history") that switches capturing on. */
	private static final String START_MARKER = "浏览历史";

	/** Marker text ("previous page") that switches capturing off. */
	private static final String END_MARKER = "上一页";

	/** True while character data and outgoing URLs should be recorded. */
	private boolean capturing = false;

	/**
	 * Passes the URL on to the superclass while capturing is on; otherwise the
	 * URL is dropped.
	 *
	 * @param url the extracted link/anchor pair offered by the parser
	 */
	@Override
	protected void addOutgoingUrls(ExtractedUrlAnchorPair url) {
		if (!capturing) {
			return;
		}
		super.addOutgoingUrls(url);
	}

	/**
	 * Updates the capture flag from the incoming text and, while capturing,
	 * appends the text to the body buffer (and to the anchor buffer when the
	 * anchor flag is set).
	 *
	 * <p>Blank chunks are ignored entirely: they neither change the flag nor get
	 * appended.
	 *
	 * @param ch     buffer holding the character data
	 * @param start  offset of the first character to read
	 * @param length number of characters to read
	 * @throws SAXException declared by the overridden method
	 */
	@Override
	public void characters(char[] ch, int start, int length) throws SAXException {
		final String text = new String(ch, start, length);
		if (StringUtils.isBlank(text)) {
			return;
		}

		// The end marker takes precedence when both markers share one chunk.
		if (text.contains(END_MARKER)) {
			capturing = false;
		} else if (text.contains(START_MARKER)) {
			capturing = true;
		}

		if (!capturing) {
			return;
		}

		bodyText.append(ch, start, length);
		if (anchorFlag) {
			anchorText.append(text);
		}
	}
}
